Data preparation
Dataset - German Housing Data
Origin data set cleaned:
Variable rooms, bathrooms, bedrooms, floors, garages, Year_built, Year_renovated changed from decimal to integer Subset with buildings up to 20 rooms for our analysis created Rooms with .5 rounded up Levels of the variable Energy_source and Garagetype transformed.
Load of the cleaned dataset
data <- read.csv('german_housing_cleaned.csv',header =T, encoding='UTF-8')
head(data)
## Price Type Living_space Lot Rooms Bedrooms Bathrooms Floors
## 1 498000 Multiple dwelling 106.00 229 5 3 1 2
## 2 495000 Mid-terrace house 140.93 517 6 3 2 NA
## 3 749000 Farmhouse 162.89 82 5 3 2 4
## 4 259000 Farmhouse 140.00 814 4 NA 2 2
## 5 469000 Multiple dwelling 115.00 244 4 2 1 NA
## 6 1400000 Mid-terrace house 310.00 860 8 NA NA 3
## Year_built Furnishing_quality Year_renovated Condition Heating
## 1 2005 normal NA refurbished central heating
## 2 1994 basic NA refurbished stove heating
## 3 2013 NA dilapidated stove heating
## 4 1900 basic 2000 fixer-upper central heating
## 5 1968 refined 2019 refurbished central heating
## 6 1969 basic NA maintained
## Energy_source Energy_efficiency_class State City
## 1 natural gas D Baden-Württemberg Bodenseekreis
## 2 Baden-Württemberg Konstanz (Kreis)
## 3 district heating B Baden-Württemberg Esslingen (Kreis)
## 4 electricity G Baden-Württemberg Waldshut (Kreis)
## 5 oil F Baden-Württemberg Esslingen (Kreis)
## 6 oil Baden-Württemberg Stuttgart
## Place Garages Garagetype
## 1 Bermatingen 2 Parking lot
## 2 Engen 7 Parking lot
## 3 Ostfildern 1 Garage
## 4 Bonndorf im Schwarzwald 1 Garage
## 5 Leinfelden-Echterdingen 1 Garage
## 6 Süd 2 Garage
Inspection of all variables
str(data)
summary(data)
colnames(data)
apply(data,2,function(x) sum(is.na(x)))
Analysis of the distribution of the variables: ‘Price’, ‘Living_space’, ‘Rooms’ and ‘Lot’
par(mfrow = c(2,2))
#Price
plot(density(data$Price))
#Living_space
plot(density(data$Living_space))
#Rooms
hist(data$Rooms)
#Lot
plot(density(data$Lot))
Result: The variables are right skewed
Log Transformation of variables
Therefore we will use the Log Transformation of the variables ‘Price’, ‘Living_space’, ‘Rooms’, ‘Lot’ to get a nearly normal distribution
par(mfrow = c(2,2))
#Price
price.log <- density(log(data$Price))
plot(price.log)
#Living_space
living.log <- density(log(data$Living_space))
plot(living.log)
#Rooms
rooms.log <- log(data$Rooms)
hist(rooms.log)
#Lot
lot.log <- density(log(data$Lot))
plot(lot.log)
Add new columns with log transformed variables price, living space and rooms
data1 <- data
data1$log.price <- log(data1$Price)
data1$log.living <- log(data1$Living_space)
data1$log.rooms <- log(data1$Rooms)
data1$log.lot <- log(data1$Lot)
data1$Condition <- factor(data1$Condition)
str(data1)
## 'data.frame': 10318 obs. of 24 variables:
## $ Price : num 498000 495000 749000 259000 469000 1400000 3500000 630000 364000 1750000 ...
## $ Type : chr "Multiple dwelling" "Mid-terrace house" "Farmhouse" "Farmhouse" ...
## $ Living_space : num 106 141 163 140 115 ...
## $ Lot : num 229 517 82 814 244 860 5300 406 973 1460 ...
## $ Rooms : int 5 6 5 4 4 8 13 10 10 6 ...
## $ Bedrooms : num 3 3 3 NA 2 NA NA NA 4 4 ...
## $ Bathrooms : num 1 2 2 2 1 NA 4 NA 4 2 ...
## $ Floors : num 2 NA 4 2 NA 3 NA 3 2 3 ...
## $ Year_built : num 2005 1994 2013 1900 1968 ...
## $ Furnishing_quality : chr "normal" "basic" "" "basic" ...
## $ Year_renovated : num NA NA NA 2000 2019 ...
## $ Condition : Factor w/ 8 levels "","as new","dilapidated",..: 8 8 3 6 8 7 3 8 8 8 ...
## $ Heating : chr "central heating" "stove heating" "stove heating" "central heating" ...
## $ Energy_source : chr "natural gas" "" "district heating" "electricity" ...
## $ Energy_efficiency_class: chr "D" "" "B" "G" ...
## $ State : chr "Baden-Württemberg" "Baden-Württemberg" "Baden-Württemberg" "Baden-Württemberg" ...
## $ City : chr "Bodenseekreis" "Konstanz (Kreis)" "Esslingen (Kreis)" "Waldshut (Kreis)" ...
## $ Place : chr "Bermatingen" "Engen" "Ostfildern" "Bonndorf im Schwarzwald" ...
## $ Garages : num 2 7 1 1 1 2 7 2 8 2 ...
## $ Garagetype : chr "Parking lot" "Parking lot" "Garage" "Garage" ...
## $ log.price : num 13.1 13.1 13.5 12.5 13.1 ...
## $ log.living : num 4.66 4.95 5.09 4.94 4.74 ...
## $ log.rooms : num 1.61 1.79 1.61 1.39 1.39 ...
## $ log.lot : num 5.43 6.25 4.41 6.7 5.5 ...
options(scipen=999) #block scientific notation
library(ggplot2)
attach(data)
We plot the response variable “Price” against the predictor “Living_Space” to get a first impression and grahical analysis.
#Living_space
ggplot(data1, aes(log.living, log.price)) + geom_point() + geom_smooth(method = lm, se = T, color = 'red') + ggtitle('Scatterplot with regression line for log(Price) against log.living')
#linear model
lm.log.price_living <- lm(log.price ~ log.living, data = data1)
summary(lm.log.price_living)
##
## Call:
## lm(formula = log.price ~ log.living, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.7836 -0.4079 0.0856 0.4683 2.7581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.17522 0.07596 107.62 <0.0000000000000002 ***
## log.living 0.90280 0.01455 62.05 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7145 on 10316 degrees of freedom
## Multiple R-squared: 0.2718, Adjusted R-squared: 0.2717
## F-statistic: 3850 on 1 and 10316 DF, p-value: < 0.00000000000000022
#estimated regression coefficients
coef(lm.log.price_living)
## (Intercept) log.living
## 8.1752232 0.9028032
##linear model
lm.log.price_living_type <- lm(data = data1, log.price ~ log.living + Type)
summary(lm.log.price_living_type)
##
## Call:
## lm(formula = log.price ~ log.living + Type, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9802 -0.3849 0.0716 0.4556 2.7356
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.47225 0.09409 79.414 < 0.0000000000000002 ***
## log.living 0.99691 0.01679 59.378 < 0.0000000000000002 ***
## TypeBungalow 0.24691 0.05737 4.304 0.000016956457415 ***
## TypeCastle -0.33356 0.31169 -1.070 0.284573
## TypeCorner house -0.14759 0.06058 -2.436 0.014857 *
## TypeDuplex -0.06320 0.03900 -1.620 0.105186
## TypeFarmhouse 0.26026 0.04599 5.659 0.000000015603919 ***
## TypeMid-terrace house 0.24470 0.03679 6.651 0.000000000030494 ***
## TypeMultiple dwelling 0.35983 0.05029 7.155 0.000000000000891 ***
## TypeResidential property 0.17411 0.05094 3.418 0.000634 ***
## TypeSingle dwelling 0.39721 0.04091 9.708 < 0.0000000000000002 ***
## TypeSpecial property 0.46909 0.05103 9.193 < 0.0000000000000002 ***
## TypeVilla 0.70309 0.05077 13.847 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6912 on 10305 degrees of freedom
## Multiple R-squared: 0.3193, Adjusted R-squared: 0.3185
## F-statistic: 402.8 on 12 and 10305 DF, p-value: < 0.00000000000000022
#estimated regression coefficients
coef(lm.log.price_living_type)
## (Intercept) log.living TypeBungalow
## 7.4722543 0.9969080 0.2469139
## TypeCastle TypeCorner house TypeDuplex
## -0.3335592 -0.1475928 -0.0631954
## TypeFarmhouse TypeMid-terrace house TypeMultiple dwelling
## 0.2602565 0.2447012 0.3598267
## TypeResidential property TypeSingle dwelling TypeSpecial property
## 0.1741124 0.3972086 0.4690858
## TypeVilla
## 0.7030881
#intercept of Type "NULL"
coef(lm.log.price_living_type)['(Intercept)']
## (Intercept)
## 7.472254
#intercept of Type single dwelling
coef(lm.log.price_living_type)['(Intercept)'] + coef(lm.log.price_living_type)['TypeSingle dwelling']
## (Intercept)
## 7.869463
##linear model
lm.log.price_living_type2 <- lm(data = data1, log.price ~ log.living * Type)
summary(lm.log.price_living_type2)
##
## Call:
## lm(formula = log.price ~ log.living * Type, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0574 -0.3835 0.0672 0.4576 2.7139
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 7.73609 0.27923 27.705
## log.living 0.94614 0.05331 17.749
## TypeBungalow 0.92173 0.48628 1.895
## TypeCastle -1.04467 5.02138 -0.208
## TypeCorner house -0.80053 0.58144 -1.377
## TypeDuplex 1.34421 0.35134 3.826
## TypeFarmhouse 0.63145 0.53470 1.181
## TypeMid-terrace house -0.68139 0.31354 -2.173
## TypeMultiple dwelling 0.24793 0.69059 0.359
## TypeResidential property -0.31372 0.45320 -0.692
## TypeSingle dwelling -0.75763 0.42213 -1.795
## TypeSpecial property -0.77518 0.45681 -1.697
## TypeVilla 0.06219 0.60861 0.102
## log.living:TypeBungalow -0.11718 0.08861 -1.322
## log.living:TypeCastle 0.12151 0.79328 0.153
## log.living:TypeCorner house 0.12396 0.10936 1.133
## log.living:TypeDuplex -0.24967 0.06560 -3.806
## log.living:TypeFarmhouse -0.08080 0.10831 -0.746
## log.living:TypeMid-terrace house 0.18009 0.06011 2.996
## log.living:TypeMultiple dwelling 0.01967 0.13964 0.141
## log.living:TypeResidential property 0.09284 0.08538 1.087
## log.living:TypeSingle dwelling 0.23218 0.08354 2.779
## log.living:TypeSpecial property 0.25164 0.09095 2.767
## log.living:TypeVilla 0.11600 0.10750 1.079
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## log.living < 0.0000000000000002 ***
## TypeBungalow 0.058061 .
## TypeCastle 0.835198
## TypeCorner house 0.168607
## TypeDuplex 0.000131 ***
## TypeFarmhouse 0.237658
## TypeMid-terrace house 0.029787 *
## TypeMultiple dwelling 0.719593
## TypeResidential property 0.488813
## TypeSingle dwelling 0.072720 .
## TypeSpecial property 0.089737 .
## TypeVilla 0.918609
## log.living:TypeBungalow 0.186066
## log.living:TypeCastle 0.878261
## log.living:TypeCorner house 0.257038
## log.living:TypeDuplex 0.000142 ***
## log.living:TypeFarmhouse 0.455705
## log.living:TypeMid-terrace house 0.002741 **
## log.living:TypeMultiple dwelling 0.887975
## log.living:TypeResidential property 0.276883
## log.living:TypeSingle dwelling 0.005460 **
## log.living:TypeSpecial property 0.005673 **
## log.living:TypeVilla 0.280593
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6879 on 10294 degrees of freedom
## Multiple R-squared: 0.3264, Adjusted R-squared: 0.3249
## F-statistic: 216.9 on 23 and 10294 DF, p-value: < 0.00000000000000022
#estimated regression coefficients
coef(lm.log.price_living_type2)
## (Intercept) log.living
## 7.73608549 0.94613962
## TypeBungalow TypeCastle
## 0.92172518 -1.04467031
## TypeCorner house TypeDuplex
## -0.80052753 1.34420839
## TypeFarmhouse TypeMid-terrace house
## 0.63144756 -0.68139164
## TypeMultiple dwelling TypeResidential property
## 0.24793247 -0.31371529
## TypeSingle dwelling TypeSpecial property
## -0.75763067 -0.77518375
## TypeVilla log.living:TypeBungalow
## 0.06219264 -0.11718366
## log.living:TypeCastle log.living:TypeCorner house
## 0.12151270 0.12396267
## log.living:TypeDuplex log.living:TypeFarmhouse
## -0.24967456 -0.08079852
## log.living:TypeMid-terrace house log.living:TypeMultiple dwelling
## 0.18008764 0.01967192
## log.living:TypeResidential property log.living:TypeSingle dwelling
## 0.09283769 0.23217929
## log.living:TypeSpecial property log.living:TypeVilla
## 0.25164379 0.11599769
# The "P-Values - confidence intervals" duality
confint(lm.log.price_living_type2)
## 2.5 % 97.5 %
## (Intercept) 7.18873463 8.28343635
## log.living 0.84164786 1.05063138
## TypeBungalow -0.03148560 1.87493595
## TypeCastle -10.88754235 8.79820173
## TypeCorner house -1.94027217 0.33921710
## TypeDuplex 0.65550987 2.03290692
## TypeFarmhouse -0.41667545 1.67957057
## TypeMid-terrace house -1.29599299 -0.06679028
## TypeMultiple dwelling -1.10576712 1.60163207
## TypeResidential property -1.20207438 0.57464379
## TypeSingle dwelling -1.58509414 0.06983281
## TypeSpecial property -1.67062159 0.12025410
## TypeVilla -1.13079934 1.25518461
## log.living:TypeBungalow -0.29088557 0.05651824
## log.living:TypeCastle -1.43346849 1.67649390
## log.living:TypeCorner house -0.09041307 0.33833842
## log.living:TypeDuplex -0.37826357 -0.12108555
## log.living:TypeFarmhouse -0.29311479 0.13151774
## log.living:TypeMid-terrace house 0.06226578 0.29790949
## log.living:TypeMultiple dwelling -0.25405926 0.29340310
## log.living:TypeResidential property -0.07451478 0.26019016
## log.living:TypeSingle dwelling 0.06841797 0.39594061
## log.living:TypeSpecial property 0.07335498 0.42993261
## log.living:TypeVilla -0.09472399 0.32671936
formula(lm.log.price_living_type)
## log.price ~ log.living + Type
summary(lm.log.price_living_type)$r.squared
## [1] 0.3192727
summary(lm.log.price_living_type)$adj.r.squared
## [1] 0.31848
formula(lm.log.price_living_type2)
## log.price ~ log.living * Type
summary(lm.log.price_living_type2)$r.squared
## [1] 0.3263949
summary(lm.log.price_living_type2)$adj.r.squared
## [1] 0.3248899
The function fitted() can be used to extract the predicted values for the existing observations
attach(data)
## The following objects are masked from data (pos = 3):
##
## Bathrooms, Bedrooms, City, Condition, Energy_efficiency_class,
## Energy_source, Floors, Furnishing_quality, Garages, Garagetype,
## Heating, Living_space, Lot, Place, Price, Rooms, State, Type,
## Year_built, Year_renovated
#lm.log.price_living
fitted.price_living <- fitted(lm.log.price_living)
str(fitted.price_living)
## Named num [1:10318] 12.4 12.6 12.8 12.6 12.5 ...
## - attr(*, "names")= chr [1:10318] "1" "2" "3" "4" ...
head(fitted.price_living)
## 1 2 3 4 5 6
## 12.38539 12.64253 12.77327 12.63655 12.45896 13.35422
plot(log(Price)~ log(Living_space), main = 'Model log(Price) ~ log(Living_space)', col = 'navy', pch = 16)
points(fitted.price_living ~ log(Living_space), col = 'red', pch = 16)
abline(lm.log.price_living, col = 'yellow', lwd = 2.5)
attach(data1)
resid.price_living <- resid(lm.log.price_living)
length(resid.price_living)
## [1] 10318
head(resid.price_living)
## 1 2 3 4 5 6
## 0.7329643 0.4697818 0.7532264 -0.1719706 0.5993948 0.7977636
set.seed(100)
id <- sample(x = 1:10318, size = 5)
resid.price_living[id]
## 3786 503 3430 3696 4090
## 0.5106768 -0.3315001 0.4470366 -0.9261093 1.0834033
fitted.price_living[id]
## 3786 503 3430 3696 4090
## 13.77484 12.69884 13.46378 12.41883 13.03221
plot(log(Price) ~ log(Living_space), main = 'Model log(Price) ~ log(Living_space)', col = 'navy', pch = 16)
abline(lm.log.price_living, col = 'green', lwd = 2.5)
points(log(Price) ~ log(Living_space), data = data1[id, ], col = 'red', pch = 4, lwd = 5)
segments(x0 = data1[id, 'log.living'], x1 = data1[id, 'log.living'],
y0 = fitted.price_living[id], y1 = data1[id, 'log.price'], col = 'yellow', lwd = 2)
## Predicting values using splitted data set 80:20 ratio
#split dataset
split80 <- round(nrow(data1)* 0.80)
train <- data1[1:split80,]
test <- data1[(split80 + 1):nrow(data1),]
dim(train)
## [1] 8254 24
dim(test)
## [1] 2064 24
#linear regression model
lm.train <- lm(log.price ~ log.living, data = train)
summary(lm.train)
##
## Call:
## lm(formula = log.price ~ log.living, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.8765 -0.3774 0.0557 0.4283 2.6734
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.45284 0.07981 105.91 <0.0000000000000002 ***
## log.living 0.86819 0.01524 56.96 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6563 on 8252 degrees of freedom
## Multiple R-squared: 0.2822, Adjusted R-squared: 0.2821
## F-statistic: 3245 on 1 and 8252 DF, p-value: < 0.00000000000000022
#predictions
pred.new.living <- predict(object = lm.train, newdata = test)
pred.new.living.CI <- predict(object = lm.train, interval = 'prediction', newdata = test)
#display predictions
plot(log.price ~log.living, data = data1, main = 'Prediction with Model log(Price) ~ log(Living_space)', col = 'navy', pch = 16)
points(x = test$log.living, y= pred.new.living, col = 'red', pch = 16, cex = 1.5)
abline(lm.train, col = 'yellow', lwd = 2.5)
plot(log.price ~ log.living, data = train, main = 'Prediction with Model log(Price) ~ log(Living_space)', col = 'navy', pch = 16)
segments(x0 = test$log.living, x1 = test$log.living,
y0 = pred.new.living.CI[, 'lwr'], y1 = pred.new.living.CI[, 'upr'], lwd = 2, col = 'green')
points(x = test$log.living, y= pred.new.living.CI[,'fit'], col = 'red', pch = 16, cex =1.5)
abline(lm.train, col = 'yellow', lwd = 2.5)
condition.box.with_outlier <- ggplot(data1, aes(x=Condition, y=log.price)) + geom_boxplot(outlier.colour = 'red')+ theme(axis.text.x = element_text(angle = 90)) + ggtitle('Boxplots of log(Price) against Condition with outliers in red')
plot(condition.box.with_outlier)
#model
lm.price_condition.1 <- lm(log.price ~ Condition, data = data1)
summary(lm.price_condition.1)
##
## Call:
## lm(formula = log.price ~ Condition, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6043 -0.4304 0.0333 0.4884 3.6247
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 12.95695 0.04213 307.553
## Conditionas new -1.01885 0.24696 -4.126
## Conditiondilapidated 0.47108 0.04838 9.738
## Conditionfirst occupation 0.19331 0.09330 2.072
## Conditionfirst occupation after refurbishment -0.05091 0.05685 -0.895
## Conditionfixer-upper -0.10881 0.05398 -2.016
## Conditionmaintained -0.42435 0.04906 -8.649
## Conditionrefurbished -0.14229 0.04328 -3.288
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## Conditionas new 0.0000373 ***
## Conditiondilapidated < 0.0000000000000002 ***
## Conditionfirst occupation 0.03829 *
## Conditionfirst occupation after refurbishment 0.37055
## Conditionfixer-upper 0.04384 *
## Conditionmaintained < 0.0000000000000002 ***
## Conditionrefurbished 0.00101 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8071 on 10310 degrees of freedom
## Multiple R-squared: 0.07146, Adjusted R-squared: 0.07083
## F-statistic: 113.4 on 7 and 10310 DF, p-value: < 0.00000000000000022
#coefficients
coef(lm.price_condition.1)
## (Intercept)
## 12.95694626
## Conditionas new
## -1.01884973
## Conditiondilapidated
## 0.47108462
## Conditionfirst occupation
## 0.19330741
## Conditionfirst occupation after refurbishment
## -0.05090966
## Conditionfixer-upper
## -0.10881029
## Conditionmaintained
## -0.42434950
## Conditionrefurbished
## -0.14228677
aggregate(log.price ~Condition,
FUN = mean, data = data1)
## Condition log.price
## 1 12.95695
## 2 as new 11.93810
## 3 dilapidated 13.42803
## 4 first occupation 13.15025
## 5 first occupation after refurbishment 12.90604
## 6 fixer-upper 12.84814
## 7 maintained 12.53260
## 8 refurbished 12.81466
#model without slope, only intercept
lm.price_condition.0 <- lm(log.price ~ 1, data = data1)
summary(lm.price_condition.0)
##
## Call:
## lm(formula = log.price ~ 1, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6576 -0.4388 0.0287 0.5166 3.5125
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.867983 0.008243 1561 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8373 on 10317 degrees of freedom
coef(lm.price_condition.0)
## (Intercept)
## 12.86798
#Anova
anova(lm.price_condition.0, lm.price_condition.1)
## Analysis of Variance Table
##
## Model 1: log.price ~ 1
## Model 2: log.price ~ Condition
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 10317 7232.5
## 2 10310 6715.7 7 516.86 113.36 < 0.00000000000000022 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#post-hoc contrasts
library(multcomp)
unique(data1$Condition)
## [1] refurbished dilapidated
## [3] fixer-upper maintained
## [5] as new
## [7] first occupation after refurbishment first occupation
## 8 Levels: as new dilapidated ... refurbished
ph.test.1 <- glht(model = lm.price_condition.1, linfct = mcp(Condition = c('refurbished - dilapidated = 0')))
summary(ph.test.1)
##
## Simultaneous Tests for General Linear Hypotheses
##
## Multiple Comparisons of Means: User-defined Contrasts
##
##
## Fit: lm(formula = log.price ~ Condition, data = data1)
##
## Linear Hypotheses:
## Estimate Std. Error t value Pr(>|t|)
## refurbished - dilapidated == 0 -0.61337 0.02576 -23.81 <0.0000000000000002
##
## refurbished - dilapidated == 0 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)
R uses “treatment contrasts” and therefore the Intercept refers to the first in alphabetical order, here “Null”. The other coefficients represent the difference.
#str(data1)
Year_built1 <- as.integer(data1$Year_built)
floor1 <- as.integer(data1$Floors)
typeof(Year_built1)
## [1] "integer"
lm.price_condition.2 <- update(lm.price_condition.1,. ~ . + Type + log.rooms + State +Energy_efficiency_class + Year_built1 + Furnishing_quality + floor1)
formula(lm.price_condition.2)
## log.price ~ Condition + Type + log.rooms + State + Energy_efficiency_class +
## Year_built1 + Furnishing_quality + floor1
drop1(lm.price_condition.2, test = "F")
## Single term deletions
##
## Model:
## log.price ~ Condition + Type + log.rooms + State + Energy_efficiency_class +
## Year_built1 + Furnishing_quality + floor1
## Df Sum of Sq RSS AIC F value
## <none> 2148.4 -8631.8
## Condition 7 17.57 2165.9 -8587.1 8.3684
## Type 11 134.63 2283.0 -8215.5 40.7950
## log.rooms 1 152.62 2301.0 -8138.9 508.7221
## State 15 638.74 2787.1 -6784.8 141.9381
## Energy_efficiency_class 9 33.56 2181.9 -8538.0 12.4294
## Year_built1 1 56.88 2205.2 -8445.4 189.5786
## Furnishing_quality 4 346.08 2494.4 -7562.8 288.3902
## floor1 1 31.86 2180.2 -8527.7 106.1896
## Pr(>F)
## <none>
## Condition 0.0000000003205 ***
## Type < 0.00000000000000022 ***
## log.rooms < 0.00000000000000022 ***
## State < 0.00000000000000022 ***
## Energy_efficiency_class < 0.00000000000000022 ***
## Year_built1 < 0.00000000000000022 ***
## Furnishing_quality < 0.00000000000000022 ***
## floor1 < 0.00000000000000022 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
By including polynomials (e.g. x1 + x1^2) we can model non linear relationships with a Linear Model.
library(ggplot2)
attach(data1)
Graphical analysis
log(Price) ~ log(Living_space)
gg.log.price_log.living <- ggplot(data1,mapping = aes(y = log.price, x = log.living)) + geom_point()
gg.log.price_log.living + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
gg.log.price_log.living <- ggplot(data1,mapping = aes(y = log.price, x = Year_built1)) + geom_point()
gg.log.price_log.living + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 666 rows containing non-finite values (stat_smooth).
## Warning: Removed 666 rows containing missing values (geom_point).
gg.log.price_log.living <- ggplot(data1,mapping = aes(y = log.price, x = log.living, colour = log.rooms)) + geom_point()
gg.log.price_log.living + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Quadratic Effects
#model with a linear effect for log.living
lm.living.1 <- lm(log.price ~ log.living + Year_built1)
drop1(lm.living.1, test = "F")
## Single term deletions
##
## Model:
## log.price ~ log.living + Year_built1
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 4106.2 -8243.4
## log.living 1 1938.49 6044.6 -4513.1 4555.2 < 0.00000000000000022 ***
## Year_built1 1 655.04 4761.2 -6816.8 1539.3 < 0.00000000000000022 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#model with a quadratic effect for log.living
lm.living.2 <- update(lm.living.1, . ~ . + I(log.living^2))
summary(lm.living.2)
##
## Call:
## lm(formula = log.price ~ log.living + Year_built1 + I(log.living^2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6813 -0.3840 0.0215 0.4053 4.1527
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.8930634 0.5350769 -7.276 0.000000000000371 ***
## log.living 1.9999960 0.1905934 10.494 < 0.0000000000000002 ***
## Year_built1 0.0046546 0.0001207 38.553 < 0.0000000000000002 ***
## I(log.living^2) -0.1004879 0.0180993 -5.552 0.000000028983449 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6513 on 9648 degrees of freedom
## (666 observations deleted due to missingness)
## Multiple R-squared: 0.3711, Adjusted R-squared: 0.3709
## F-statistic: 1897 on 3 and 9648 DF, p-value: < 0.00000000000000022
#test in quadratic
anova(lm.living.1, lm.living.2)
## Analysis of Variance Table
##
## Model 1: log.price ~ log.living + Year_built1
## Model 2: log.price ~ log.living + Year_built1 + I(log.living^2)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 9649 4106.2
## 2 9648 4093.1 1 13.077 30.825 0.00000002898 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Second Model has lower RSS Value and therefore slightly better
#plot
gg.log.price_log.living + geom_smooth(method = 'lm', formula = y ~poly(x, degree = 2))
#model with a quadratic poly
lm.living.3 <- lm(log.price ~ log.rooms + poly(log.living, degree = 2))
summary(lm.living.3)
##
## Call:
## lm(formula = log.price ~ log.rooms + poly(log.living, degree = 2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9761 -0.3927 0.0714 0.4552 2.6652
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.68390 0.04616 296.42 <0.0000000000000002
## log.rooms -0.44504 0.02490 -17.88 <0.0000000000000002
## poly(log.living, degree = 2)1 59.73564 1.11071 53.78 <0.0000000000000002
## poly(log.living, degree = 2)2 -7.82809 0.70453 -11.11 <0.0000000000000002
##
## (Intercept) ***
## log.rooms ***
## poly(log.living, degree = 2)1 ***
## poly(log.living, degree = 2)2 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7009 on 10314 degrees of freedom
## Multiple R-squared: 0.2994, Adjusted R-squared: 0.2992
## F-statistic: 1469 on 3 and 10314 DF, p-value: < 0.00000000000000022
#model with a cubic poly
lm.living.4 <- lm(log.price ~ log.rooms + poly(log.living, degree = 3))
summary(lm.living.4)
##
## Call:
## lm(formula = log.price ~ log.rooms + poly(log.living, degree = 3))
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9763 -0.3928 0.0714 0.4553 2.6650
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.68416 0.04669 293.108 <0.0000000000000002
## log.rooms -0.44519 0.02519 -17.676 <0.0000000000000002
## poly(log.living, degree = 3)1 59.74067 1.11848 53.412 <0.0000000000000002
## poly(log.living, degree = 3)2 -7.82850 0.70465 -11.110 <0.0000000000000002
## poly(log.living, degree = 3)3 -0.02715 0.70903 -0.038 0.969
##
## (Intercept) ***
## log.rooms ***
## poly(log.living, degree = 3)1 ***
## poly(log.living, degree = 3)2 ***
## poly(log.living, degree = 3)3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.701 on 10313 degrees of freedom
## Multiple R-squared: 0.2994, Adjusted R-squared: 0.2991
## F-statistic: 1102 on 4 and 10313 DF, p-value: < 0.00000000000000022
gg.log.lot.log.price <- ggplot(data = data1, mapping = aes(y = log.lot, x = log.price)) + geom_point()
gg.log.lot.log.price + geom_smooth(method = 'gam')
library(splines)
lm.regression_splines <- lm(log.price ~ bs(log.living, df = 3))
summary(lm.regression_splines)
##
## Call:
## lm(formula = log.price ~ bs(log.living, df = 3))
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.8153 -0.4074 0.0784 0.4607 2.7500
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.2652 0.2111 43.897 <0.0000000000000002 ***
## bs(log.living, df = 3)1 3.9273 0.4428 8.869 <0.0000000000000002 ***
## bs(log.living, df = 3)2 4.1135 0.1701 24.182 <0.0000000000000002 ***
## bs(log.living, df = 3)3 5.4793 0.4107 13.342 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7115 on 10314 degrees of freedom
## Multiple R-squared: 0.2782, Adjusted R-squared: 0.278
## F-statistic: 1325 on 3 and 10314 DF, p-value: < 0.00000000000000022
library(mgcv)
## Loading required package: nlme
## This is mgcv 1.8-33. For overview type 'help("mgcv-package")'.
GAMs for log(Price) ~ s(Rooms)
attach(data1)
## The following objects are masked from data1 (pos = 6):
##
## Bathrooms, Bedrooms, City, Condition, Energy_efficiency_class,
## Energy_source, Floors, Furnishing_quality, Garages, Garagetype,
## Heating, Living_space, log.living, log.lot, log.price, log.rooms,
## Lot, Place, Price, Rooms, State, Type, Year_built, Year_renovated
## The following objects are masked from data1 (pos = 12):
##
## Bathrooms, Bedrooms, City, Condition, Energy_efficiency_class,
## Energy_source, Floors, Furnishing_quality, Garages, Garagetype,
## Heating, Living_space, log.living, log.lot, log.price, log.rooms,
## Lot, Place, Price, Rooms, State, Type, Year_built, Year_renovated
## The following objects are masked from data (pos = 13):
##
## Bathrooms, Bedrooms, City, Condition, Energy_efficiency_class,
## Energy_source, Floors, Furnishing_quality, Garages, Garagetype,
## Heating, Living_space, Lot, Place, Price, Rooms, State, Type,
## Year_built, Year_renovated
## The following objects are masked from data (pos = 14):
##
## Bathrooms, Bedrooms, City, Condition, Energy_efficiency_class,
## Energy_source, Floors, Furnishing_quality, Garages, Garagetype,
## Heating, Living_space, Lot, Place, Price, Rooms, State, Type,
## Year_built, Year_renovated
gam.log.price.rooms <- gam(log.price ~ s(log.rooms))
summary(gam.log.price.rooms)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log.price ~ s(log.rooms)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.867983 0.007783 1653 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(log.rooms) 7.888 8.623 145.8 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.108 Deviance explained = 10.9%
## GCV = 0.62562 Scale est. = 0.62509 n = 10318
plot(gam.log.price.rooms, residuals = TRUE, cex = 2)
GAMs for log(Price) ~ log(Living_space) + s(Rooms) + s(Garages)
gam.log.price.log.living <- gam(log.price ~ log.living + s(log.rooms) + s(log(Garages)))
summary(gam.log.price.log.living)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log.price ~ log.living + s(log.rooms) + s(log(Garages))
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.46447 0.13313 48.56 <0.0000000000000002 ***
## log.living 1.24132 0.02547 48.74 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(log.rooms) 4.964 6.084 67.976 < 0.0000000000000002 ***
## s(log(Garages)) 8.215 8.681 3.852 0.0000713 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.305 Deviance explained = 30.6%
## GCV = 0.42626 Scale est. = 0.4255 n = 8437
plot(gam.log.price.log.living, residuals = TRUE, select = 1)
#stroe the residuals in data1
data1$resid.price_living <- resid(lm.log.price_living)
#creat QQ-Plot
qqnorm(resid(lm.log.price_living))
qqline(resid(lm.log.price_living))
Expected value of the errors is “on zero”
ggplot(mapping = aes(y = resid(lm.log.price_living),
x = fitted(lm.log.price_living))) +
geom_abline(intercept = 0, slope = 0) + geom_point() +
geom_smooth()
probably non-linear because the smoother is not on zero
Homoscedasticity
ggplot(mapping = aes(y = abs(resid(lm.log.price_living)), x = fitted(lm.log.price_living))) +
geom_abline(intercept = 0, slope = 0) + geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
The variance of the residuals seems to be fairly constant
Cooks distance
## 1) compute Cook’s distance
cooks.dist.log.price_living <- cooks.distance(lm.log.price_living)
##
## 2) plot it
ggplot(data = data1,
mapping = aes(y = log.price, x = log.living,
colour = cooks.dist.log.price_living)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) + facet_wrap(. ~ Type)
## `geom_smooth()` using formula 'y ~ x'
plot(lm.log.price_living, which = 5)
Testing the model assumptions
plot(lm.log.price_living_type)
Bootstrap
#mean(data$Price)
##
B <- 10^3
t.mean <- c()
for(i in 1:B){
t.id <- sample(1:10318, replace = TRUE)
t.data.price <- data$Price[t.id]
t.mean[i] <- mean(t.data.price)
}
##
length(t.mean)
## [1] 1000
hist(t.mean, breaks = 50)
abline(v = mean(data$Price), col = "red")
sorted.means <- sort(t.mean)
quantile(sorted.means, probs = c(0.025, 0.975))
## 2.5% 97.5%
## 529609.2 551910.5